library(AppliedPredictiveModeling) # Data sets
library(tidyverse) # Oppan tidy style
── Attaching packages ───────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
✔ ggplot2 2.2.1 ✔ purrr  0.2.4
✔ tibble  1.4.2 ✔ dplyr  0.7.5
✔ tidyr  0.8.1 ✔ stringr 1.3.1
✔ readr  1.1.1 ✔ forcats 0.3.0
── Conflicts ──────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(caret) # Modeling
Loading required package: lattice
Attaching package: ‘caret’
The following object is masked from ‘package:purrr’:
lift
library(e1071) # skewness
Error in library(e1071) : there is no package called ‘e1071’
# Load the cell segmentation data and convert it to a tibble so that it
# prints compactly.
data(segmentationOriginal)
segmentationOriginal <- as_tibble(segmentationOriginal)
segmentationOriginal

# Keep only the rows whose Case is "Train".
seg_data <- subset(segmentationOriginal, Case == "Train")
seg_data

# Set the identifier columns aside before dropping the first three columns.
# NOTE(review): assumes Cell, Case and Class are exactly columns 1:3 — confirm
# against the data set documentation.
cell_id <- seg_data$Cell
class <- seg_data$Class
case <- seg_data$Case
seg_data <- seg_data[, -(1:3)]

# Remove every column whose name contains "Status".
seg_data <- seg_data %>%
  select(-contains("Status"))
seg_data
Skewness
# Sample skewness of a single predictor; skewness() is provided by e1071.
library(e1071)
skewness(seg_data$AngleCh1)
[1] -0.02426252
# Skewness of every column at once. A purrr alternative is kept for reference:
# seg_data %>% map_dfr(skewness)
summarize_all(seg_data, skewness)
Box-Cox transform
# Estimate a Box-Cox transformation for AreaCh1. Printing the object shows the
# input summary and the estimated lambda.
Ch1AreaTrans <- BoxCoxTrans(seg_data$AreaCh1)
Ch1AreaTrans
Box-Cox Transformation
1009 data points used to estimate Lambda
Input data summary:
Min. 1st Qu. Median Mean 3rd Qu. Max.
150.0 194.0 256.0 325.1 376.0 2186.0
Largest/Smallest: 14.6
Sample Skewness: 3.53
Estimated Lambda: -0.9
Apply the transform with the predict function
# Apply the estimated transformation to the first few AreaCh1 values.
dat <- predict(Ch1AreaTrans, head(seg_data$AreaCh1))
dat
[1] 1.108458 1.106383 1.104520 1.103554 1.103607 1.105523
Or perform it all at once via caret::preProcess
# NOTE(review): percent_variance is not defined anywhere in the visible
# source; the chunk that computes it appears to be missing — restore it
# before running this line.
percent_variance[1:3]
[1] 20.91236 17.01330 11.88689
Near zero variance
# Column indices of near-zero-variance predictors (none are flagged here).
nearZeroVar(seg_data)
integer(0)
Correlations
# Pairwise correlation matrix of the remaining columns, and its dimensions.
correlations <- cor(seg_data)
dim(correlations)
[1] 58 58
# Top-left 4 x 4 corner of the correlation matrix.
correlations[1:4, 1:4]
AngleCh1 AreaCh1 AvgIntenCh1 AvgIntenCh2
AngleCh1 1.000000000 -0.002627172 -0.04300776 -0.01944681
AreaCh1 -0.002627172 1.000000000 -0.02529739 -0.15330301
AvgIntenCh1 -0.043007757 -0.025297394 1.00000000 0.52521711
AvgIntenCh2 -0.019446810 -0.153303007 0.52521711 1.00000000
3.1
library(mlbench)
# Load the Glass data set and inspect the structure of its columns.
data("Glass")
str(Glass)
'data.frame': 214 obs. of 10 variables:
$ RI : num 1.52 1.52 1.52 1.52 1.52 ...
$ Na : num 13.6 13.9 13.5 13.2 13.3 ...
$ Mg : num 4.49 3.6 3.55 3.69 3.62 3.61 3.6 3.61 3.58 3.6 ...
$ Al : num 1.1 1.36 1.54 1.29 1.24 1.62 1.14 1.05 1.37 1.36 ...
$ Si : num 71.8 72.7 73 72.6 73.1 ...
$ K : num 0.06 0.48 0.39 0.57 0.55 0.64 0.58 0.57 0.56 0.57 ...
$ Ca : num 8.75 7.83 7.78 8.22 8.07 8.07 8.17 8.24 8.3 8.4 ...
$ Ba : num 0 0 0 0 0 0 0 0 0 0 ...
$ Fe : num 0 0 0 0 0 0.26 0 0 0 0.11 ...
$ Type: Factor w/ 6 levels "1","2","3","5",..: 1 1 1 1 1 1 1 1 1 1 ...
Predictor variables
# Estimate a Box-Cox lambda for each numeric column of Glass. map_if() leaves
# the non-numeric Type column untouched, so it has no "lambda" element to
# extract and comes back as NULL.
# NOTE(review): the NA lambdas presumably come from columns that contain
# zeros (Box-Cox needs strictly positive data) — confirm against the
# BoxCoxTrans documentation.
Glass %>%
  map_if(is.numeric, BoxCoxTrans) %>%
  map("lambda")
$RI
[1] -2
$Na
[1] -0.1
$Mg
[1] NA
$Al
[1] 0.5
$Si
[1] 2
$K
[1] NA
$Ca
[1] -1.1
$Ba
[1] NA
$Fe
[1] NA
$Type
NULL